# -*- coding:utf8 -*-
# !/usr/bin/env python

'''
#全国企业信用信息公示系统（甘肃）
#维护黄羽
甘肃的返回数据比较慢，有两次请求需要手动延时
'''

import re
import urllib2
from bs4 import BeautifulSoup
from utils import kill_captcha
from scpy.request_util import *
import urllib
# from request_util import RequestUtil as REQ
from parse_util.parse_basesic import parse_basesic
from scpy.logger import get_logger
from table import *
import time
import traceback
import requests

logger = get_logger(__file__)

# 网站需要延时访问
WEBTIMESLEEP = 4


def download_captcha_kill(company, province):
    """
    Download the basic-info page and annual-report pages for one company
    from the Gansu enterprise credit-info site.

    No captcha image actually needs to be solved: the numeric auth code is
    issued in the ``session_authcode`` cookie that accompanies the
    captcha-image response, so fetching the image once is enough.

    :param company: company name or registration number to search for
    :param province: must be the string 'gs' (Gansu)
    :return: None when the input is invalid or the company does not exist;
             ''   when a retryable step failed (caller should retry);
             otherwise a list — element 0 is the BeautifulSoup of the
             basic-info page, elements 1..n are dicts
             ``{'soup': BeautifulSoup, 'year': str}``, one per annual report.
    :raises Exception: re-raised network errors from the HTTP requests.
    """
    # Accumulates the basic-info soup plus one dict per annual report.
    result_soup_list = []
    # Bug fix: the original used `is not "gs"` (identity test on a string
    # literal) and logged the undefined name `companyName`.
    if province != "gs" or not company:
        logger.error('输入的省份错误或公司不存在,你当前输入为,省份：%s,公司或关键字：%s' % (province, company))
        return None

    # --- Step 1: fetch the captcha image purely for its Set-Cookie header.
    img_url = r'http://xygs.gsaic.gov.cn/gsxygs/securitycode.jpg'
    get_captcha_headers = {
        'Accept': 'image/webp,image/*,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'xygs.gsaic.gov.cn',
        'Referer': 'http://xygs.gsaic.gov.cn/gsxygs/main.jsp',
    }
    req_session = requests.session()
    req_session.headers = get_captcha_headers  # proxy IPs deliberately not used here
    try:
        img_response = req_session.get(img_url, timeout=80)
    except Exception as e:
        logger.error(e)
        raise e
    set_cookie = img_response.headers['set-cookie']

    # The site throttles fast clients; wait before the next request.
    time.sleep(WEBTIMESLEEP)
    session_authcode = re.compile('.*?session_authcode=(\d+)').findall(set_cookie)[0]
    jsessionid = re.compile('.*?JSESSIONID=(.+)?;').findall(set_cookie)
    if jsessionid:
        jsessionid = jsessionid[0]
    else:  # JSESSIONID could not be extracted: back off hard and let the caller retry.
        logger.error('JSESSIONID获取失败!')
        req_session.close()
        time.sleep(WEBTIMESLEEP * 10)
        return ''
    logger.info('当前验证码为:%s' % session_authcode)

    # Sanity check: a plausible auth code is a short digit string; a very
    # long match means the cookie format changed and extraction broke.
    if len(session_authcode) > 100:
        logger.error('从response中获取的验证码方法失效')
        # Empty string => caller retries the whole download.
        return ''

    # --- Step 2: POST the search form to obtain the result list.
    company_cookie = 'JSESSIONID=' + jsessionid + ';session_authcode=' + session_authcode
    search_company_reg_info_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        # NOTE(review): header name looks mangled ('Accept-Languagehttp') —
        # kept as-is since the working requests were recorded with it.
        'Accept-Languagehttp': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Length': '259',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': company_cookie,
        'Origin': 'http://xygs.gsaic.gov.cn',
        'Host': 'xygs.gsaic.gov.cn',
        'Referer': 'http://xygs.gsaic.gov.cn/gsxygs/main.jsp',
        'Upgrade-Insecure-Requests': '1',
    }
    search_company_reg_info_request_util = RequestUtil()
    search_company_reg_info_request_util.set_hreaders(search_company_reg_info_headers, is_random_ua=False)
    search_company_reg_info_dict = {
        'browse': '',
        'loginName': '请输入注册号点击搜索',
        'cerNo': '',
        'authCode': '',
        'authCodeQuery': session_authcode,
        'queryVal': company,
    }
    search_company_reg_info_url_str = 'http://xygs.gsaic.gov.cn/gsxygs/pub!list.do'
    try:
        search_company_reg_info_res = search_company_reg_info_request_util.make_request(
            search_company_reg_info_url_str,
            method='post',
            data=search_company_reg_info_dict, timeout=80).content
    except Exception as e:
        logger.error(e)
        raise e

    # The site answers a fixed error sentence when the query matched nothing.
    if re.findall('您输入的查询条件有误，请重新输入查询条件', search_company_reg_info_res):
        logger.info('搜索的公司不存在')
        return None
    logger.info('搜索的公司存在!')

    search_company_reg_info_soup = BeautifulSoup(search_company_reg_info_res, 'html5lib')
    search_company = search_company_reg_info_soup.find_all('div', {'class': 'list'})
    if not search_company:
        logger.info('没有获取到公司列表,需要重复获取！')
        return ''
    # The first hit's element id carries the registration number.
    reg_no = re.compile('.*id="(\d+)"').findall(str(search_company[0]))[0]
    logger.info("获取公司存在,其工商号为:%s", reg_no)

    # --- Step 3: POST for the company's basic-info page.
    get_company_info_url = 'http://xygs.gsaic.gov.cn/gsxygs/pub!view.do'
    get_company_info_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': company_cookie,
        'Origin': 'http://xygs.gsaic.gov.cn',
        'Referer': 'http://xygs.gsaic.gov.cn/gsxygs/pub!list.do',
        # NOTE(review): Host value looks like it belongs to the Guangxi site,
        # not xygs.gsaic.gov.cn — kept as-is since recorded requests worked.
        'Host': 'gxqyxygs.gov.cn',
        'Upgrade-Insecure-Requests': '1',
    }
    data = {
        'regno': reg_no,
        'entcate': 'compan',
    }
    get_company_info_request_util = RequestUtil()
    get_company_info_request_util.set_hreaders(get_company_info_headers, is_random_ua=False)
    try:
        get_company_info_res = get_company_info_request_util.make_request(get_company_info_url, method='post',
                                                                          data=data, timeout=80).content
    except Exception as e:
        logger.error(e)
        raise e
    get_company_info_soup = BeautifulSoup(get_company_info_res, 'html5lib')
    result_soup_list.append(get_company_info_soup)
    logger.info("下载工商信息网页成功！")

    # --- Step 4: annual reports. A fresh session_authcode is mandatory for
    # the report endpoint, so fetch the captcha image once more for its cookie.
    year_request = RequestUtil()
    img2_res = year_request.make_request(img_url, timeout=80)
    img2_set_cookie = img2_res.headers['set-cookie']

    img2_session_authcode = re.compile('.*?session_authcode=(\d+)').findall(img2_set_cookie)[0]
    company_cookie = 'JSESSIONID=' + jsessionid + ';session_authcode=' + img2_session_authcode
    get_year_report = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Length': '58',
        'Cookie': company_cookie,
        'Content-Type': 'application/x-www-form-urlencoded',
        'Origin': 'http://xygs.gsaic.gov.cn',
        'Referer': 'http://xygs.gsaic.gov.cn/gsxygs/pub!view.do',
        'Host': 'gxqyxygs.gov.cn',
        'Upgrade-Insecure-Requests': '1',
    }
    # The second left-hand tab's onclick carries the togo(...) arguments
    # (pripid / entcate) needed to request the report list.
    li_tag = get_company_info_soup.find_all(id='leftTabs')[0].find_all('li')[1]
    onclick = li_tag['onclick']
    togo_str = re.compile('togo\(?(.*)?\)').findall(str(onclick))[0]
    togo_list = togo_str.replace("'", "").split(',')

    pripid = togo_list[1]
    entcate = togo_list[3]
    data = {
        'regno': reg_no,
        'entcate': entcate,
        'pripid': pripid,
    }
    get_year_report_request_util = RequestUtil()
    get_year_report_request_util.set_hreaders(get_year_report, is_random_ua=False)
    nblist_url = 'http://xygs.gsaic.gov.cn/gsxygs/pub!viewE.do'
    try:
        get_company_year_list = get_year_report_request_util.make_request(nblist_url, method='post', data=data,
                                                                          timeout=80).content
    except Exception as e:
        logger.error(e)
        raise e

    get_company_year_list_soup = BeautifulSoup(get_company_year_list, 'html5lib')
    year_list = get_company_year_list_soup.find_all(id="qiyenianbao")[0]
    a_tag_list = year_list.find_all('a')
    root_url = 'http://xygs.gsaic.gov.cn'

    logger.info("下载年报开始！")
    for a_tag in a_tag_list:
        year_report_soup_dict = {}
        a_onclick = a_tag['onclick']
        a_year_url_list = re.compile('window.open\(\'?(.*)\'\)').findall(a_onclick)
        if a_year_url_list:
            year = re.findall('(\d+)', a_tag.getText())[0]
            a_year_url = root_url + a_year_url_list[0]
            # The site requires a long delay (at least ~10 s) before each
            # report page, otherwise it refuses the request.
            time.sleep(3 * WEBTIMESLEEP)
            try:
                year_report_html = get_year_report_request_util.make_request(a_year_url, method='get',
                                                                             timeout=80).content
            except Exception as e:
                logger.exception(e)
                raise e
            year_report_soup = BeautifulSoup(year_report_html, 'html5lib')
            year_report_soup_dict['soup'] = year_report_soup
            year_report_soup_dict['year'] = year
            logger.info("%s年报下载成功！", year)
            result_soup_list.append(year_report_soup_dict)
    return result_soup_list


def get_company_info(result_soup_list):
    """
    Parse the company registration ("工商") information page.

    :param result_soup_list: list produced by download_captcha_kill();
        element 0 must be the BeautifulSoup of the basic-info page.
    :return: dict of parsed company fields, or None for empty input.
    """
    if not result_soup_list:
        return None

    logger.info("开始解析工商信息！")
    company_info_dict = {}

    # Fields this site never provides are pre-filled with empty lists so
    # downstream consumers always see a uniform schema.
    company_info_dict['shareHolderList'] = []
    company_info_dict['province'] = 'gs'
    company_info_dict['punishBreakList'] = []
    company_info_dict['punishedList'] = []
    company_info_dict['alidebtList'] = []
    company_info_dict['entinvItemList'] = []
    company_info_dict['frinvList'] = []
    company_info_dict['frPositionList'] = []
    company_info_dict['filiationList'] = []
    company_info_dict['caseInfoList'] = []
    company_info_dict['sharesFrostList'] = []
    company_info_dict['sharesImpawnList'] = []
    company_info_dict['morDetailList'] = []
    company_info_dict['morguaInfoList'] = []
    company_info_dict['liquidationList'] = []

    company_info_soup = result_soup_list[0]
    jibenxinxi = company_info_soup.find_all(id='jibenxinxi')[0]

    # Basic registration info: first table inside the "jibenxinxi" section.
    jibenxinxi_table_list = jibenxinxi.find_all('table')
    basic_table = jibenxinxi_table_list[0] if jibenxinxi_table_list else []
    basic_list = parse_basesic(basic_table) if basic_table else []
    if basic_list and isinstance(basic_list, list):
        company_info_dict['basicList'] = basic_list
    else:
        company_info_dict['basicList'] = []

    # Shareholder table. The site exposes no per-holder details, so the
    # detail fields are blanked out after parsing.
    share_holder_tags = jibenxinxi.find_all(id='invTab')
    if share_holder_tags:
        share_holder_list = index('股东信息', str(share_holder_tags[0]))
        for item in share_holder_list:
            item.pop('shareHolderdetail')
            item['country'] = ''
            item['subConam'] = ''
            item['regCapCur'] = ''
            item['conDate'] = ''
            item['fundedRatio'] = ''
        company_info_dict['shareHolderList'] = share_holder_list

    # Change-of-registration records.
    from parse_util.parse_alter import parse as parse_alter

    changTab = jibenxinxi.find_all(id='changTab')
    alterList = parse_alter(changTab[0]) if changTab else []
    company_info_dict['alterList'] = alterList if alterList else []

    # Key personnel (second page, "备案信息" section).
    perTab_tag = company_info_soup.find_all(id='perTab')
    if perTab_tag:
        company_info_dict['personList'] = index('主要人员信息', str(perTab_tag[0]))

    # Branch offices.
    branTab = company_info_soup.find_all(id='branTab')
    company_info_dict['filiationList'] = index('分支机构信息', str(branTab[0])) if branTab else []

    # Liquidation info.
    liquidationList = company_info_soup.find_all('table', id='t32')
    company_info_dict['liquidationList'] = index('清算信息', str(liquidationList[0])) if liquidationList else []

    # Mortgage ("动产抵押") and equity-pledge ("股权出质") registrations are
    # not published by this site, so the list stays empty. (The original
    # assigned the same key twice; the redundant duplicate was removed.)
    company_info_dict['morguaInfoList'] = []

    # Abnormal-operation records.
    excDiv = company_info_soup.find_all(id='excpTab')
    company_info_dict['abnormalOperation'] = index('经营异常信息', str(excDiv[0])) if excDiv else []

    # Spot-check records; two dummy <tr> rows are prepended — presumably
    # the parser skips leading header rows (TODO confirm against `index`).
    tableChoucha = company_info_soup.find_all(id='tableChoucha')
    company_info_dict['checkMessage'] = index('抽查检查信息', '<tr></tr>' * 2 + str(tableChoucha[0])) if tableChoucha else []

    logger.info("解析工商信息解析完成！")
    return company_info_dict


def get_year_report(result_soup_list):
    """
    Parse the annual-report pages downloaded by download_captcha_kill().

    :param result_soup_list: element 0 is the basic-info soup; elements 1..n
        are dicts ``{'soup': BeautifulSoup, 'year': str}``, one per report.
    :return: None for empty input, [] when there are no report pages,
             otherwise a list of per-year dicts.
    """
    if not result_soup_list:
        return None
    if len(result_soup_list) <= 1:
        # Only the basic-info page was downloaded: no reports to parse.
        return []

    logger.info("开始解析年报！")
    company_year_report_list = []
    for year_report_soup_dict in result_soup_list[1:]:
        # NOTE: the shareholder / equity-change / IP-contribution sections on
        # the report page carry no data and their buttons issue no requests.
        year_report_soup = year_report_soup_dict.get('soup')
        qufenkuang_list = year_report_soup.find_all(id='qufenkuang')
        if not qufenkuang_list:
            continue

        company_year_report_dict = {}
        year = year_report_soup_dict.get('year')
        table_list = qufenkuang_list[0].find_all('table')

        # Basic enterprise info.
        company_year_report_dict['baseInfo'] = report_index('企业基本信息', str(table_list[0])) if table_list else {}

        # Web sites / online shops.
        siteTab = qufenkuang_list[0].find_all(id='siteTab')
        company_year_report_dict['website'] = report_index('网站或网店信息', str(siteTab[0])) if siteTab else {}

        # Shareholders and capital contributions.
        invTab = qufenkuang_list[0].find_all(id='invTab')
        company_year_report_dict['investorInformations'] = report_index('股东及出资信息', str(invTab[0])) if invTab else []

        # Enterprise asset status. Guarded: the original indexed
        # find_all(...)[0] unconditionally and raised IndexError when the
        # anchor was missing, making its `if asset else {}` branch dead code.
        asset_tags = qufenkuang_list[0].find_all(id='asset')
        asset = asset_tags[0].parent.parent if asset_tags else None
        company_year_report_dict['assetsInfo'] = report_index('企业资产状况信息', str(asset)) if asset else {}

        # Equity changes.
        transTab = qufenkuang_list[0].find_all(id='transTab')
        company_year_report_dict['equityChangeInformations'] = report_index('股权变更信息',
                                                                            str(transTab[0])) if transTab else []

        # Modification records live in the second "qufenkuang" section;
        # guarded in case the page only has one section.
        modTab = qufenkuang_list[1].find_all(id='modTab') if len(qufenkuang_list) > 1 else []
        company_year_report_dict['changeRecords'] = report_index('修改记录', str(modTab[0])) if modTab else []

        company_year_report_dict['year'] = year
        logger.info("%s年报完成！", year)
        company_year_report_list.append(company_year_report_dict)

    return company_year_report_list


def search0(companyName):
    """
    甘肃：gs — download and parse everything for one company.

    :param companyName: company name or registration number
    :return: (raw_html_dict, company_info_dict) when the company exists,
             None otherwise.
    :raises Exception: re-raised download or parse errors.
    """
    province = 'gs'
    MAXTIME = 20
    atime = MAXTIME
    result_soup_list = ''

    # download_captcha_kill() returns '' for retryable cookie/captcha
    # failures; retry up to MAXTIME times.
    while atime > 0 and result_soup_list == '':
        try:
            result_soup_list = download_captcha_kill(companyName, province)
        except Exception as e:
            # Bug fix: print_exc() takes a `limit` argument, not the
            # exception instance the original passed.
            traceback.print_exc()
            logger.info(e)
            raise e

        if result_soup_list == '':
            atime -= 1
            logger.info("没有获取到公司列表,需要重复获取！当前设定次数为:%s ,剩余次数为:%s" % (MAXTIME, atime))
            continue
        elif result_soup_list is None:
            # Company does not exist or the keyword was invalid.
            break
        else:
            logger.info("验证码破解成功！")

    # Parse the downloaded pages.
    try:
        if result_soup_list:
            raw_html_dict = {}

            company_info_dict = get_company_info(result_soup_list)
            company_year_report_list = get_year_report(result_soup_list)
            company_info_dict['yearReportList'] = company_year_report_list

            raw_html_dict['html'] = str(result_soup_list[0])
            raw_html_dict['yearList'] = [{item['year']: str(item['soup'])} for item in result_soup_list[1:]] if len(
                result_soup_list) > 1 else []
            raw_html_dict['type'] = '2'
            raw_html_dict['province'] = 'gs'
            raw_html_dict['json'] = ''
            raw_html_dict['keyword'] = companyName

            # Guarded against an empty basicList: the original indexed [0]
            # unconditionally and could raise IndexError.
            basic_list = company_info_dict.get('basicList')
            if basic_list and basic_list[0]:
                raw_html_dict['companyName'] = basic_list[0].get('enterpriseName', '')
            else:
                raw_html_dict['companyName'] = ''

            logger.info("完成！")
            return raw_html_dict, company_info_dict
        else:
            return None
    except Exception as e:
        traceback.print_exc()
        raise e


def search2(companyName):
    """
    Retry wrapper around search0(): try up to TRY_TIME times, then give up.

    :param companyName: company name or registration number
    :return: whatever search0() returns, or None when it found nothing.
    :raises Exception: when every attempt raised.
    """
    TRY_TIME = 2
    a_time = TRY_TIME
    while a_time > 0:
        try:
            res = search0(companyName)
            if not res:
                return None
            return res
        except Exception:
            # Bug fix: print_exc() takes a `limit` argument, not the
            # exception instance the original passed.
            traceback.print_exc()
            a_time -= 1
            continue
    # Reached only when every attempt raised (a_time counted down to 0).
    raise Exception("尝试次数达到上限!")


def search(companyName):
    """
    Public entry point: return only the parsed company-info dict.

    :param companyName: company name or registration number
    :return: the company-info dict from search2(), or None when the
             company was not found.
    """
    result = search2(companyName)
    if result:
        return result[1]
    return None


if __name__ == "__main__":
    # Manual smoke test: look up a single Gansu company and dump the result
    # (raw-html dict + parsed info dict tuple) as pretty-printed JSON.
    # Alternative test companies are kept below for quick switching.
    # companyName = u'定西巨安商贸有限责任公司'
    # companyName = u'兰州民百（集团）股份有限公司亚欧餐饮分公司'
    companyName = u'甘肃中牧山丹马场总场四场'
    # companyName = u'甘肃兰生生物药业有限公司'
    # companyName = u'甘肃电投能源发展股份有限公司'
    # companyName = u'兰州民百（集团）股份有限公司'
    # companyName = u'甘肃天庆房地产经纪有限公司'
    # province = 'gx'
    result = search2(companyName)
    import json

    # Python 2 print statement; ensure_ascii=False keeps Chinese readable.
    print json.dumps(result, indent=4, ensure_ascii=False)
